# -*-coding:utf-8-*-
__author__ = 'Administrator'
from bs4 import BeautifulSoup
import urllib
import urllib2
import re
import urlparse

# Crawl the site's index page, follow every link to an article page (href
# containing "<digits>.htm"), and print the absolute URL of the first .zip
# download found on each article page.

root_url = 'http://www.5iweb.com.cn'  # Target website.

request = urllib2.Request(root_url)
response = urllib2.urlopen(request)
try:
    content = response.read()
finally:
    # Release the connection even if reading the body fails.
    response.close()

# Keep only anchors whose href contains a numeric page name ("<digits>.htm").
soup = BeautifulSoup(content, 'html.parser', from_encoding='utf-8')
links = soup.find_all("a", href=re.compile(r"\d+\.htm"))

# Compiled once here instead of on every loop iteration.
zip_href_pattern = re.compile(r".+\.zip")

# Absolute article URLs already visited; the index may link the same article
# more than once and each one only needs to be fetched a single time.
new_urls = set()

for link in links:
    new_url = link['href']
    new_full_url = urlparse.urljoin(root_url, new_url)
    if new_full_url in new_urls:
        continue
    new_urls.add(new_full_url)

    details = urllib2.Request(new_full_url)
    try:
        detailsres = urllib2.urlopen(details)
    except urllib2.URLError:
        # One unreachable article page (HTTPError is a URLError subclass)
        # must not abort the rest of the crawl.
        continue
    try:
        detailsContent = detailsres.read()
    finally:
        detailsres.close()

    detailssoup = BeautifulSoup(detailsContent, 'html.parser', from_encoding='utf-8')

    # find() returns None when the page has no .zip link; skip such pages
    # instead of failing on the subscript below.
    downUrl = detailssoup.find("a", href=zip_href_pattern)
    if downUrl is None:
        continue

    # Resolve against the article page itself so document-relative hrefs end
    # up in the right directory; absolute and root-relative hrefs resolve the
    # same way as they would against root_url.
    down_full_url = urlparse.urljoin(new_full_url, downUrl['href'])

    # Title element of the article (the <h2> inside div.subTitle), or None
    # when the page lacks that block. Not used for output yet.
    title_block = detailssoup.find("div", class_="subTitle")
    file_name = title_block.h2 if title_block is not None else None

    # NOTE: the archive itself is not downloaded; only its URL is reported.
    print(down_full_url)